# Direct outputs not eligible for public release

# GLOBAL SETTINGS --------------------------------------------------------------

options(
  scipen = 999,
  digits = 16,
  max.print = .Machine$integer.max,
  show.error.locations = TRUE,
  warn = 1
)

RNGkind("L'Ecuyer-CMRG")
seed <- 818675309L
set.seed(seed) # setting main seed

# PACKAGES ---------------------------------------------------------------------

library(data.table)

# PACKAGE SETTINGS -------------------------------------------------------------

# data.table
setDTthreads(threads = 1)
options(datatable.print.class = TRUE, datatable.print.keys = TRUE)
# so that printing the data.table also shows the variable type on top

# BEGIN FILE -------------------------------------------------------------------

years <- 1999:2016
num_cores <- length(years)

DoCalcs <- function(year) {

    print(sprintf("Started calcs for year='%s'", year))

    # Read in per-adult population data
    data <- fread(
        sprintf("~/population-annual-data/per-adult-tax-yr-%s.csv", year),
        select =
            c(
                "tin", # de-identified unique ID for a person
                "f1040_adjgross", # AGI, per-adult
                "age",
                "gender",
                "married",
                "db_w2_wages", # W2 wages (individually-reported)
                "db_homeowner" # homeownership indicator (individually-reported)
            )
    )

    # Constrain to those with age information, and between 21 and 64
    data <- data[!is.na(age) & between(age, 21, 64, incbounds = TRUE)]

    # Constrain to tax filers
    data[, filer := as.integer(!is.na(f1040_adjgross))]
    data <- data[filer == 1]

    # Consistently replace missings with 0 where relevant
    data[is.na(f1040_adjgross), f1040_adjgross := 0]
    data[is.na(db_w2_wages), db_w2_wages := 0]
    data[is.na(db_homeowner), db_homeowner := 0]

    # Generate two characteristics for table
    data[, female := as.integer(gender == "F")]

    data[is.na(married), married := 0]
    setnames(data, "married", "married_ch") # just to match lottery data

    # Make a paid employment indicator
    data[, has_w2_wages := as.integer(db_w2_wages > 0)]

    summary_table_overall_filers <-
        data[
            ,
            list(
                mean_female = mean(female),
                mean_age = mean(age),
                mean_wages = mean(db_w2_wages),
                mean_employment = mean(has_w2_wages),
                mean_married = mean(married_ch),
                mean_home = mean(db_homeowner),
                count = .N
            )
        ]

    summary_table_overall_filers[, tax_yr := as.integer(year)]

    # Calculate quantiles of the AGI distribution for this tax year
    coarse_grid <- (seq.int(from = 5, to = 95, by = 5)) / 100
    coarse_quantile_evals_all <-
        data.table(
            quantile = coarse_grid,
            value = quantile(data$f1040_adjgross, probs = coarse_grid),
            count = dim(data)[1]
        )

    # Housekeeping
    data <- NULL
    rm(data)

    # Read in stacked estimation sample data (2 years prior to win)
    ES_data <-
        readRDS("~/population-panel-data/stacked-lottery-data-omitted-year.rds")

    ES_data[, population_quantile := as.numeric(NA)]
    for (q in coarse_grid) {
        ES_data[
            per_adult_adjgross >= coarse_quantile_evals_all[quantile == q]$value,
            population_quantile := q
        ]
    }
    ES_data[is.na(population_quantile), population_quantile := 0]

    quantile_summary <-
        ES_data[
            ,
            .(
                count = .N,
                total = dim(ES_data)[1]
            ),
            by = .(population_quantile)
        ][order(population_quantile)]

    quantile_summary[, tax_yr := as.integer(year)]

    # Housekeeping
    ES_data <- NULL
    rm(ES_data)

    output <- list()
    output[[1]] <- summary_table_overall_filers
    output[[2]] <- quantile_summary

    saveRDS(
        output,
        sprintf(
            "~/summary-output/population-stats-and-quantile-shares-%s.rds",
            year
        )
    )

    warning()
    return(year)
}

if (is.null(num_cores)) {
    num_cores <- 1
}
print(paste("num_cores", num_cores))
if (num_cores > 1) {
    parallel::mclapply(years, DoCalcs, mc.silent = FALSE, mc.cores = num_cores)
} else {
    lapply(years, DoCalcs)
}

print(warnings())
